######################################################################################################                                                                          
# Description: Random intercept and random slope logistic mixed model with unspecified random effects (RE)
#			   distribution.      
# Application: HILDA dataset with time as (wave - 1)/10  and  Age at Wave 1 as (Age at Baseline - 30)/10                                                   
# Notes: For the estimation of this model we use a 2-step procedure:                                  
#        (1) We estimate the RE distribution using the VEM algorithm.                                 
#            In this model estimating both the intercept betas.0 of the model and the mean of the RE  
#            distribution is not identified. Thus, betas.0 is kept fixed through the iterative        
#            procedure and at the end it is updated as betas.0 = betas.0 + mean.b, where mean.b       
#            is the mean of the RE distribution, i.e.,  mean.b = sum (pis * gridp)                    
#        (2) The model parameters \betas are estimated using Newton-Raphson.                                     
#                                                                                                                                                                                                
# Programmers : R. Tsonaka and L. Marquart                                                                            
######################################################################################################


## Read in the data:
dat_All<- read.table("HILDA.csv", header=TRUE, sep=",")

 ##################################################################################################################################
 ## Summary of variables in dataset (see STATA do file for more information on how the variables were created):
 ## XWAVEID: unique subject ID
 ## employment_2cat: Binary variable for employment status 
 ## WAVE: Wave of the HILDA panel survey 
 ## AgeBaseline: Age of subject at first wave observed in the study
 ## maritalStat3cat1: Indicator variable of whether subject is seperated, divorced or widowed at that wave
 ## maritalStat3cat2: Indicator variable of whether subject is single at that wave 
 ## BaselineEduc12: Indicator variable of whether subject has Year 12, a diploma or a certificate as the highest education
 ## BaselineEduc3: Indicator variable of whether subject has Year 11 or less as the highest education
 ## DependChild1: Indicator variable of whether subject has any dependent children with the youngest < 5 years of age
 ## DependChild2: Indicator variable of whether subject has any dependent children with the youngest aged between 5-24 years of age
 ##################################################################################################################################

## Remove any time-points for a subject that have missing covariate data
dat<-dat_All[complete.cases(dat_All[,205:216]),]

## Total sample size and other parameters
n <- length(unique(dat$XWAVEID)) # sample size unique number of subjects



## Set up time and age at baseline variables
dat$waveMinus1Per10<-(dat$WAVE - 1)/10
dat$AgeBaseline_Minus30Per10<-(dat$AgeBaseline-30)/10

## Define the model matrix (Wave, Age at Baseline, Marital Status (2 categories); Age at Baseline (2 categories) and Dependent Children (2 categories))
X <- model.matrix(~  waveMinus1Per10 + AgeBaseline_Minus30Per10 + maritalStat3cat1 + maritalStat3cat2 + BaselineEduc12 + BaselineEduc3 + DependChild1 + DependChild2 , data = dat)
dim(X)

colnames(X) <- c("Intercept",  "waveMinus1Per10", "AgeBaseline_Minus30Per10", "maritalStat3cat1", "maritalStat3cat2", "BaselineEduc12", "BaselineEduc3", "DependChild1","DependChild2")
N <- nrow(X)

## Define ID variable (xwaveid) and binary outcome variable (Employment status)
id.y <- dat$XWAVEID
y <- dat$employment_2cat


## Descriptives + Initial values (from a logistic model))
table(y)

options(na.action='na.omit')
## Fit ordinary logistic model (ignoring clustering) to obtain initial values
mod.glm <- glm(y ~  waveMinus1Per10 + AgeBaseline_Minus30Per10  + maritalStat3cat1 + maritalStat3cat2 + BaselineEduc12 + BaselineEduc3 + DependChild1 + DependChild2, family = binomial, data = dat)

## Parameters for the measurement model
betas <- coef(mod.glm) # initial values 

## Stopping rules
tol <- sqrt(.Machine$double.eps)         # Tolerance to exclude support points with almost 0 weight
tol.cd <- 1e-04                          # Tolerance for the numerical derivative used in the estimation of \betas and \sig
eps.logLik <- 1e-07                      # Stopping rule for the log-likelihood function
eps.grad <- 1e-03                        # Stopping rule for the gradient function
iter <- 4000                             # Maximum number of iterations


############################################################
## Functions required
############################################################

## Function to estimate the step length 'a'
chang.weight <- function (step, pis) {
    pis.new <- pis
    pis.new[ind.min] <- (1 - step) * pis[ind.min]
    pis.new[ind.max] <- pis[ind.max] + step * pis[ind.min]
    fy.b <- exp(rowsum(dbinom(y, prob = plogis(cmu), size = 1, log = TRUE), id.y))
    fy <- as.vector(fy.b %*% pis.new)
    sum(log(fy)) - lgLik
}

## Numerical derivative - central difference - for scalars
cd <- function (x, f, ..., eps = 1e-04) {
    n <- length(x)
    res <- numeric(n)
    ex <- pmax(abs(x), 1)
    for (i in 1:n) {
        x1 <- x2 <- x
        x1[i] <- x[i] + eps * ex[i]
        x2[i] <- x[i] - eps * ex[i]
        diff.f <- c(f(x1, ...) - f(x2, ...))
        diff.x <- 2 * max(abs(c(x1[i] - x[i], x2[i] - x[i])))
        res[i] <- diff.f / diff.x
    }
    res
}

## The -log-likelihood as a function of \betas, \sig used in Newton-Raphson
fun.betas <- function(thet){
    betas <- thet 
    mu <- c(as.matrix(Z)%*%betas.01 + as.matrix(X) %*% betas)
    cmu <- mu + grid.random
    fy.b <- exp(rowsum(dbinom(y, prob = plogis(cmu), size = 1, log = TRUE), id.y))
    fy <- as.vector(fy.b %*% pis)
    -sum(log(fy))
}

gr.betas <- function(thet){            
    cd(thet, f = fun.betas, eps = tol.cd)
} 

#########################################################

### Logistic regression with random intercepts and slopes: creating the Z design matrix and X design matrix, 
##														   storing coefficient estimates related to random effects,
##														   defining coefficents to be estimated.

Z<-X[,c(1,2)]								  ##Create the Z design matrix
X <- X[, -c(1,2), drop = FALSE]               ##Remove the intercept and the slope from the design matrix

betas.0 <- betas[1]       					 ## The intercept beta.0 is kept fixed at the true value
betas.1 <- betas[2]							 ## The slope beta.1 is kept fixed at the true value
betas.01<-betas[c(1,2)]      				 ## The coefficients of the random effects are kept at the true value
betas <- betas[-c(1,2)]        				 ## The remaining betas are to be estimated


##Calculate the cholesky decomposition - using the assumed normal RE random intercept and slope logistic model fitted to the data 
##Have fit the model to the data and the following RE covariance was estimated:
RECov<-matrix(c(17.569,-5.2038,-5.2038,26.7565), nrow=2)
chmat<-chol(RECov)
chmatInv<-solve(chmat)


## Create grid to be used for VEM to estimate the RE distribution
grid.b0min <- -7      # The lower bound of the grid for the scaled random intercept
grid.b0max <-  7      # The upper bound of the grid for the scaled random intercept
grid.b1min <- -7      # The lower bound of the grid for the scaled random slope
grid.b1max <-  7      # The upper bound of the grid for the scaled random slope
K <- 21    			  # Number of support points in each dimension of the grid
gridp.b0 <- seq(grid.b0min, grid.b0max, length.out = K)
gridp.b1 <- seq(grid.b1min, grid.b1max, length.out = K)

gridp_qdim_scaled<-cbind(rep(gridp.b0,K),rep(gridp.b1 ,each=K)) ##Define the grid for the scaled random effects

gridp_qdim<-chmat%*%t(gridp_qdim_scaled) ##Back-transform the grid to the original scale by using the Cholesky decomposition

pis <- rep(1/(K*K), (K*K))				 ## Initial values for the probability weights
M<-K*K									 ## Total number of grid points


log.Lik <- numeric(iter)

for (it in 1:iter) {
    mu <- c(as.matrix(Z)%*%betas.01 + as.matrix(X) %*% betas) 	## The fixed effects part of the linear predictor
 
	grid.random<-as.matrix(Z)%*%as.matrix(gridp_qdim) 
    cmu <- mu + grid.random
    fy.b <- exp(rowsum(dbinom(y, prob = plogis(cmu), size = 1, log = TRUE), id.y))
    fy <- as.vector(fy.b %*% pis)
    log.Lik[it] <- lgLik <- sum(log(fy))
    cat("\niter:", it, "\tlogLik:", lgLik)
    if(it > 1) {diff.LgLik <- log.Lik[it] - log.Lik[it - 1]
        cat("\nDif:", diff.LgLik)
    }
    if (it > 1 && (diff.LgLik < 0))
        stop("\nlog-likelihood failed to increase.")
    if (it > 1 && (diff.LgLik < eps.logLik * (abs(log.Lik[it - 1]) + eps.logLik) & max(grad) < 1 + eps.grad)) {
        cat("\nconverged!")
        break
        }
  ############################	
  #### Step 1: VEM step   ####
  ############################
    grad <- colMeans(fy.b / c(fy))    ##Calculate the gradient
    ind.min <- which.min(grad)		  ##Identify the support point that corresponds with the minimum gradient 
    ind.max <- which.max(grad)		  ##Identify the support point that corresponds with the maximum gradient 
    cat("\nMin:", ind.min, "\tMax:", ind.max)
    cat("\nGradMin:", grad[ind.min], "\tGradMax:", grad[ind.max])
	
	## Determine optimal step length within the VEM step
    a <- if((opt <- chang.weight(1, pis = pis)) > 0) {
        cat("\nIncrease:", opt)
            1 
        } else {
    opt <- optimize(chang.weight, interval = c(0, 1), maximum = TRUE, pis = pis)
    cat("\nIncrease:", opt$objective)
    a <- opt$maximum
    }
    cat("\nalpha:", a)   
    pis.new <- pis
    pis.new[ind.min] <- (1 - a) * pis[ind.min]
    pis.new[ind.max] <- pis[ind.max] + a * pis[ind.min]
    pis <- pis.new
    ind.keep <- pis > tol			
    pis <- pis[ind.keep]				## Removes any support points with negligable probability weight (i.e. those < tol)
    gridp_qdim <- gridp_qdim[,ind.keep]
    M <-  dim(t(gridp_qdim))[[1]]

    cat("\nPoints:", M)
 
    mub <- colSums(t(gridp_qdim )* pis)  ##Estimated means of the random effects (i.e. random intercept and random slope)
    cat("\nMeanb:", mub)
 	grid.random<-as.matrix(Z)%*%as.matrix(gridp_qdim)  
    
	#######################################
	### Step 2: Estimate sig and betas ###
	#######################################
    ## The Newton-Raphson step  
    opt.betas <- optim(c(betas), fn = fun.betas, gr = gr.betas, method = "BFGS", control = list(trace = 100, maxit = if (it < 100) 3 else 10))
    betas <- opt.betas$par
    cat("\nbetas:", betas)

}

## Estimate mean of the random effects and estimate the variance-covariance matrix of the random effects
mu.b <- colSums(t(gridp_qdim) * pis)
var.matrix <- t(t(gridp_qdim) - rep(mu.b, each = M)) %*% ((t(gridp_qdim) - rep(mu.b, each = M)) * pis)

mu.b
var.matrix




##############################################################
# To obtain Standard Errors for the Regression Parameters
##############################################################
opt.betas <- optim(c(betas), fn = fun.betas, gr = gr.betas, method = "BFGS", 
                        control = list(trace = 100, maxit = if (it < 100) 3 else 10), hessian = TRUE)

std.errors <- sqrt(diag(solve(opt.betas$hessian)))
names(std.errors) <- c("AgeBaseline_Minus30Per10", "maritalStat3cat1", "maritalStat3cat2", "BaselineEduc12", "BaselineEduc3", "DependChild1","DependChild2")

betas_F <- c(betas.01 + mu.b, betas) # The final \betas. After convergence, the intercept and the coefficent of wave is updated by adding the mean of the RE
cat("\nFinal betas:", betas_F)
cat("\nStandard errors:", c(NA, NA,std.errors))


## Estimate of the random effects discrete distribution (Original grid adjusted for the estimated means of the random effects)
ResultsRandomEffect<-cbind(t(gridp_qdim)[,1]-mu.b[1],t(gridp_qdim)[,2]-mu.b[2] ,pis,grad)
colnames(ResultsRandomEffect)<-c("b0","b1","pis","gradient")
ResultsRandomEffect



 